Load libraries

suppressMessages(library(tidyverse))
suppressMessages(library(plotly))
suppressMessages(library(rstatix)) # levene_test, kruskal_test, dunn_test, shapiro_test, wilcox_test

1) Read in the gapminder_clean.csv data as a tibble using read_csv

dat <- read_csv('gapminder_clean.csv') %>%
  select(-X1)
dim(dat)
## [1] 2607   19

The gapminder data consists of a matrix with 2607 rows and 20 columns.

2) Filter the data to include only rows where Year is 1962 and then make a scatter plot comparing ‘CO2 emissions (metric tons per capita)’ and gdpPercap for the filtered data

dat %>% 
  filter(Year == 1962) %>%
  # remove rows which contain NAs
  filter(!is.na(`CO2 emissions (metric tons per capita)`) & !is.na(gdpPercap)) %>%
  ggplot(aes(x = gdpPercap, y = `CO2 emissions (metric tons per capita)`)) +
  geom_point() +
  ylab("CO2 emissions (metric tons per capita)")

# Because of one very large outlier the analysis repreated with log transformed data

dat %>% 
  filter(Year == 1962) %>%
  # remove rows which contain NAs
  filter(!is.na(`CO2 emissions (metric tons per capita)`) & !is.na(gdpPercap)) %>%
  ggplot(aes(x = log(gdpPercap), y = log(`CO2 emissions (metric tons per capita)`))) +
  geom_point() +
  ylab("log(CO2 emissions (metric tons per capita))")

Log transformed data can be used to make data as “normal” as possible so that the statistical analysis results become more valid.

3) On the filtered data, calculate the pearson correlation of ‘CO2 emissions (metric tons per capita)’ and gdpPercap. What is the Pearson R value and associated p value?

dat %>%
  filter(Year == 1962) %>%
  # remove rows which contain NAs
  filter(!is.na(`CO2 emissions (metric tons per capita)`) & !is.na(gdpPercap)) %>%
  summarise(cor_coef = cor.test(gdpPercap, `CO2 emissions (metric tons per capita)`)$estimate,
            p_val = cor.test(gdpPercap, `CO2 emissions (metric tons per capita)`)$p.value)
## # A tibble: 1 x 2
##   cor_coef    p_val
##      <dbl>    <dbl>
## 1    0.926 1.13e-46

The Pearson R value of ‘CO2 emissions (metric tons per capita)’ and gdpPercap in Year 1962 is 0.9260817 and the associated p value is 1.128679e-46.

# Because of one very large outlier the analysis repreated with log transformed data

dat %>%
  filter(Year == 1962) %>%
  # remove rows which contain NAs
  filter(!is.na(`CO2 emissions (metric tons per capita)`) & !is.na(gdpPercap)) %>%
  summarise(cor_coef = cor.test(log(gdpPercap), log(`CO2 emissions (metric tons per capita)`))$estimate,
            p_val = cor.test(log(gdpPercap), log(`CO2 emissions (metric tons per capita)`))$p.value)
## # A tibble: 1 x 2
##   cor_coef    p_val
##      <dbl>    <dbl>
## 1    0.860 8.90e-33

The Pearson R value of log transformed ‘CO2 emissions (metric tons per capita)’ and gdpPercap in Year 1962 is 0.8602081 and the associated p value is 8.903567e-33.

4) On the unfiltered data, answer “In what year is the correlation between ‘CO2 emissions (metric tons per capita)’ and gdpPercap the strongest?” Filter the dataset to that year for the next step…

dat %>%
  # remove rows which contain NAs
  filter(!is.na(`CO2 emissions (metric tons per capita)`) & !is.na(gdpPercap)) %>%
  # group origial tibble depending on the year
  group_by(Year) %>%
  summarise(cor_coef = cor.test(log(gdpPercap), log(`CO2 emissions (metric tons per capita)`))$estimate,
            p_val = cor.test(log(gdpPercap), log(`CO2 emissions (metric tons per capita)`))$p.value) %>%
  arrange(desc(cor_coef))
## # A tibble: 10 x 3
##     Year cor_coef    p_val
##    <dbl>    <dbl>    <dbl>
##  1  2002    0.930 2.64e-55
##  2  2007    0.928 1.06e-55
##  3  1997    0.921 1.00e-51
##  4  1982    0.915 8.43e-47
##  5  1987    0.908 6.76e-45
##  6  1977    0.901 2.94e-43
##  7  1992    0.893 1.57e-43
##  8  1972    0.884 1.59e-39
##  9  1967    0.874 1.59e-36
## 10  1962    0.860 8.90e-33

In year 2002 is the correlation between ‘CO2 emissions (metric tons per capita)’ and gdpPercap the strongest.

5) Using plotly, create an interactive scatter plot comparing ‘CO2 emissions (metric tons per capita)’ and gdpPercap, where the point size is determined by pop (population) and the color is determined by the continent. You can easily convert any ggplot plot to a plotly plot using the ggplotly() command.

viz <- dat %>% 
    filter(Year == 2002) %>%
    # remove rows which contain NAs
    filter(!is.na(`CO2 emissions (metric tons per capita)`) & !is.na(gdpPercap)) %>%
    ggplot(aes(x = log(gdpPercap), y = log(`CO2 emissions (metric tons per capita)`), size = pop, color = continent)) +
   geom_point() + 
  ylab("log(CO2 emissions (metric tons per capita)")
ggplotly(viz)

6) What is the relationship between continent and ‘Energy use (kg of oil equivalent per capita)’?

First we must choose a suitable statistical test. We have continuous data and want to check for differences in mean for more than two groups. 

Therefore, we have to check if the parametric assumptions are satisfied. For ANOVA, we have four parametric assumptions that must be met:

6.1) Samples must be independent

dat %>%
  filter(continent != "") %>%
  # remove rows which contain NAs
  filter(!is.na(`Energy use (kg of oil equivalent per capita)`)) %>%
  ggplot(aes(x=continent, y=`Energy use (kg of oil equivalent per capita)`, fill = continent)) +
  geom_boxplot()

First, exploratory data analysis (EDA) was performed on the data. We can say that the samples are independent.

6.2) Population variances must be equal (Levene’s test)

dat %>%
  filter(continent != "") %>%
  # remove rows which contain NAs
  filter(!is.na(`Energy use (kg of oil equivalent per capita)`)) %>%
  # perform Levene’s test
  levene_test(`Energy use (kg of oil equivalent per capita)` ~ continent)
## # A tibble: 1 x 4
##     df1   df2 statistic        p
##   <int> <int>     <dbl>    <dbl>
## 1     4   843      12.4 8.00e-10

We reject the H0 hypothesis that the variances are equal based on the p value of our Levene’s test. Therefore, we can not use an ANOVA test and must use a Kruskal-Wallis test instead.

6.3) Kruskal-Wallis test

dat %>%
  filter(continent != "") %>%
  # remove rows which contain NAs
  filter(!is.na(`Energy use (kg of oil equivalent per capita)`)) %>%
  # perform Kruskal-Wallis test
  kruskal_test(`Energy use (kg of oil equivalent per capita)` ~ continent)
## # A tibble: 1 x 6
##   .y.                                     n statistic    df        p method     
## * <chr>                               <int>     <dbl> <int>    <dbl> <chr>      
## 1 Energy use (kg of oil equivalent p~   848      319.     4 1.01e-67 Kruskal-Wa~

The resulting p value of our Kruskal-Wallis test shows that there is a significant difference in the mean for our data. Because of our significant result, we additionally perform a dunn’s test.

dat %>%
  filter(continent != "") %>%
  # remove rows which contain NAs
  filter(!is.na(`Energy use (kg of oil equivalent per capita)`)) %>%
  dunn_test(`Energy use (kg of oil equivalent per capita)` ~ continent, p.adjust.method = "bonferroni")
## # A tibble: 10 x 9
##    .y.        group1 group2    n1    n2 statistic        p    p.adj p.adj.signif
##  * <chr>      <chr>  <chr>  <int> <int>     <dbl>    <dbl>    <dbl> <chr>       
##  1 Energy us~ Africa Ameri~   199   188      6.17 6.62e-10 6.62e- 9 ****        
##  2 Energy us~ Africa Asia     199   185      4.85 1.22e- 6 1.22e- 5 ****        
##  3 Energy us~ Africa Europe   199   256     16.4  1.10e-60 1.10e-59 ****        
##  4 Energy us~ Africa Ocean~   199    20      8.15 3.69e-16 3.69e-15 ****        
##  5 Energy us~ Ameri~ Asia     188   185     -1.28 2.01e- 1 1   e+ 0 ns          
##  6 Energy us~ Ameri~ Europe   188   256      9.63 5.92e-22 5.92e-21 ****        
##  7 Energy us~ Ameri~ Ocean~   188    20      5.46 4.86e- 8 4.86e- 7 ****        
##  8 Energy us~ Asia   Europe   185   256     11.0  6.04e-28 6.04e-27 ****        
##  9 Energy us~ Asia   Ocean~   185    20      6.01 1.80e- 9 1.80e- 8 ****        
## 10 Energy us~ Europe Ocean~   256    20      1.54 1.23e- 1 1   e+ 0 ns

7) Is there a significant difference between Europe and Asia with respect to ‘Imports of goods and services (% of GDP)’ in the years after 1990?

Again, we must choose a suitable test first. We have continuous data and want to check for differences in mean for two groups. 

Therefore, we have to check if the parametric assumptions are satisfied. For Student’s unpaired t-test, we have four parametric assumptions that must be met:

7.1) The observations are sampled independently

dat %>%
  filter(Year > 1990 & (continent == "Europe" | continent == "Asia")) %>%
  # remove rows which contain NAs
  filter(!is.na(`Imports of goods and services (% of GDP)`)) %>%
  ggplot(aes(x=continent, y=`Imports of goods and services (% of GDP)`, fill = continent)) +
  geom_boxplot() +
  ylab("Imports of goods and services (% of GDP)")

First, exploratory data analysis (EDA) was performed on the data. We can say that the samples are independent. We can also say that the dependent variable is measured on the incremental level year and that independent variables consist of matched pairs.

7.2) The dependent variable is normally distributed (Shapiro-Wilk normality test)

dat %>%
  filter(Year > 1990 & (continent == "Europe" | continent == "Asia")) %>%
  # remove rows which contain NAs
  filter(!is.na(`Imports of goods and services (% of GDP)`)) %>%
  rename(., Imports = `Imports of goods and services (% of GDP)`) %>%
  # perform Shapiro-Wilk normality test
  shapiro_test(Imports)
## # A tibble: 1 x 3
##   variable statistic        p
##   <chr>        <dbl>    <dbl>
## 1 Imports      0.849 1.46e-13

We reject the H0 hypothesis that the data is normally distributed based on the p values of our Shapiro-Wilk normality test. Therefore, we can not use a Student’s unpaired t-test and must use a Wilcoxon Rank sums test instead.

7.3) Wilcoxon Rank sums test

dat %>%
  filter(Year > 1990 & (continent == "Europe" | continent == "Asia")) %>%
  # remove rows which contain NAs
  filter(!is.na(`Imports of goods and services (% of GDP)`)) %>%
  rename(., Imports = `Imports of goods and services (% of GDP)`) %>%
  # perform Wilcoxon Rank sums test
  wilcox_test(Imports ~ continent)
## # A tibble: 1 x 7
##   .y.     group1 group2    n1    n2 statistic     p
## * <chr>   <chr>  <chr>  <int> <int>     <dbl> <dbl>
## 1 Imports Asia   Europe    98   114      5707 0.787

We reject the H0 hypothesis that there is a significant difference between Europe and Asia with respect to ‘Imports of goods and services (% of GDP)’ in the years after 1990 based on the p values of our Wilcoxon Rank sums test.

8) What is the country (or countries) that has the highest ‘Population density (people per sq. km of land area)’ across all years? (i.e., which country has the highest average ranking in this category across each time point in the dataset?)

First, exploratory data analysis (EDA) was performed on the data.

viz <- dat %>%
  # remove rows which contain NAs
  filter(!is.na(Year) & !is.na(`Country Name`) & !is.na(`Population density (people per sq. km of land area)`)) %>%
  ggplot(aes(x=Year, y=`Population density (people per sq. km of land area)`, group = `Country Name`)) + 
  geom_line() + 
  ylab("Population density (people per sq. km of land area)")

ggplotly(viz)
dat %>%
  # remove rows which contain NAs
  filter(!is.na(Year) & !is.na(`Country Name`) & !is.na(`Population density (people per sq. km of land area)`)) %>%
  group_by(Year) %>%
  arrange(Year, desc(`Population density (people per sq. km of land area)`)) %>%
  mutate(ranking = row_number()) %>%
  group_by(`Country Name`) %>%
  summarize(avg_ranking = mean(ranking)) %>%
  arrange(avg_ranking)
## # A tibble: 262 x 2
##    `Country Name`            avg_ranking
##    <chr>                           <dbl>
##  1 Macao SAR, China                  1.5
##  2 Monaco                            1.5
##  3 Hong Kong SAR, China              3.1
##  4 Singapore                         3.9
##  5 Gibraltar                         5  
##  6 Bermuda                           6.2
##  7 Malta                             7  
##  8 Bangladesh                        9.2
##  9 Channel Islands                   9.4
## 10 Sint Maarten (Dutch part)        10.5
## # ... with 252 more rows

The countries that have the highest ‘Population density (people per sq. km of land area)’ across all years are Macao SAR and Monaco.

9) What country (or countries) has shown the greatest increase in ‘Life expectancy at birth, total (years)’ since 1962?

First, exploratory data analysis (EDA) was performed on the data.

viz <- dat %>%
  # remove rows which contain NAs
  filter(!is.na(Year), !is.na(`Country Name`), !is.na(`Life expectancy at birth, total (years)`)) %>%
  ggplot(aes(x=Year, y=`Life expectancy at birth, total (years)`, group = `Country Name`)) +
  geom_line() + 
  ylab("Life expectancy at birth, total (years)")

ggplotly(viz)

To get the difference for Life expectancy at birth, total (years) for every country between 1962 and 2007 we subtract the value from 2007 minus the value from 1962.

dat %>%
  filter(Year == 1962 | Year == 2007) %>%
  # remove rows which contain NAs
  filter(!is.na(Year), !is.na(`Country Name`), !is.na(`Life expectancy at birth, total (years)`)) %>%
  group_by(`Country Name`) %>%
  # Calculate difference between 1962 and 2007
  mutate(diff = `Life expectancy at birth, total (years)` - lag(`Life expectancy at birth, total (years)`, default = 0)) %>% 
  select(`Country Name`, diff) %>%
  # keep only rows with calculated difference
  filter(row_number() %% 2 == 0) %>%
  # sort by difference
  arrange(desc(diff))
## # A tibble: 236 x 2
## # Groups:   Country Name [236]
##    `Country Name`      diff
##    <chr>              <dbl>
##  1 Maldives            36.9
##  2 Bhutan              33.2
##  3 Timor-Leste         31.1
##  4 Tunisia             30.9
##  5 Oman                30.8
##  6 Nepal               30.6
##  7 China               29.9
##  8 Yemen, Rep.         27.2
##  9 Saudi Arabia        26.7
## 10 Iran, Islamic Rep.  26.6
## # ... with 226 more rows

Country Maldives has shown the greatest increase in ‘Life expectancy at birth, total (years)’ since 1962.